#install.packages(c("forecast", "expsmooth", "seasonal")) 
library(TTR)
library(forecast) 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries) 
library(expsmooth) 
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.4.2     ✔ fma     2.5
## 
library(seasonal)
library(MASS)
## 
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
## 
##     cement, housing, petrol
library(stats)
library(TSA)
## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(forecast)
library(ggplot2)
library(tseries)
library(imputeTS)
## 
## Attaching package: 'imputeTS'
## The following object is masked from 'package:tseries':
## 
##     na.remove
library(vars)
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(timetk)
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(lmtest)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(xts)
library(dplyr)
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite

Step 1: Load data and plot the time series

path <- "/Users/aashishsingh/Desktop/Time Series - Final Project/"
pm2_5_data <- read.csv(paste0(path, "la_pm2_5_pollutant_data.csv"), 
                     na.strings=c("", "NA"))
ozone_data <- read.csv(paste0(path, "la_ozone_pollutant_data.csv"), 
                     na.strings=c("", "NA"))

# Convert data into ts objects
pm2_5_ts <- ts(pm2_5_data$PM2.5.AQI.Value, start = c(1999,1,1), frequency = 365.25)
ozone_ts <- ts(ozone_data$Ozone.AQI.Value, start = c(1999,1,1), frequency = 365.25)

plot(pm2_5_ts, main="Los Angeles PM 2.5 AQI")

plot(ozone_ts, main="Los Angeles Ozone AQI")

Step 2: Check if data is stationary (KPSS/ADF Test)

kpss.test(pm2_5_ts)
## Warning in kpss.test(pm2_5_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_ts
## KPSS Level = 6.951, Truncation lag parameter = 12, p-value = 0.01
# The reported p-value is 0.01, which is smaller than 0.05, and would suggest 
# that we reject the null hypothesis of level stationarity and conclude that 
# there is evidence that the data is non-stationary

adf.test(pm2_5_ts)
## Warning in adf.test(pm2_5_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_ts
## Dickey-Fuller = -14.554, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
# Similarly the Augumented Dickey-Fuller test results in p-value of 0.01 (<0.05) 
# where we do reject the null hypothesis that the time series is non-stationary
# and thus data is stationary.

# These are opposite results so we use ACF PACF plots to see if the series is stationary
acf(pm2_5_ts, main = "ACF of Original Time Series")

pacf(pm2_5_ts, main = "PACF of Original Time Series")

Step 3: Extract weekly values for time series and plot the series

# Create weekly data
pm2_5_df <- as.data.frame(pm2_5_ts)
pm2_5_df$Date <- mdy(pm2_5_data$Date)
colnames(pm2_5_df) <- c("PM2.5.AQI.Value", "Date")

# Convert to xts format for weekly conversion
pm2_5_xts <- as.xts(pm2_5_df, order.by=pm2_5_df$Date)
pm2_5_xts <- pm2_5_xts[,-2]
pm2_5_weekly <- apply.weekly(pm2_5_xts, mean)
#pm2_5_weekly

pm2_5_weekly_ts <- ts(pm2_5_weekly["19990103/20230811"], start = c(1999,1),frequency = 52.18)
plot(pm2_5_weekly_ts)

kpss.test(pm2_5_weekly_ts)
## Warning in kpss.test(pm2_5_weekly_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_weekly_ts
## KPSS Level = 3.4904, Truncation lag parameter = 7, p-value = 0.01
adf.test(pm2_5_weekly_ts)
## Warning in adf.test(pm2_5_weekly_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_weekly_ts
## Dickey-Fuller = -8.4636, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_weekly_ts, main = "ACF of Weekly Time Series")

pacf(pm2_5_weekly_ts, main = "PACF of Weekly Time Series")

Step 4: Extract monthly values for time series and plot the series

# Create monthly data
pm2_5_monthly <- apply.monthly(pm2_5_xts, mean)
#pm2_5_monthly

pm2_5_monthly_ts <- ts(pm2_5_monthly["19990131/20230811"], start = c(1999,1), frequency = 12)
plot(pm2_5_monthly_ts)

kpss.test(pm2_5_monthly_ts)
## Warning in kpss.test(pm2_5_monthly_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_monthly_ts
## KPSS Level = 1.9774, Truncation lag parameter = 5, p-value = 0.01
adf.test(pm2_5_monthly_ts)
## Warning in adf.test(pm2_5_monthly_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_monthly_ts
## Dickey-Fuller = -7.4466, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
acf(pm2_5_monthly_ts, main = "ACF of Monthly Time Series")

pacf(pm2_5_monthly_ts, main = "PACF of Monthly Time Series")

Step 5: Understand time series variance

plot(pm2_5_weekly_ts)

# Strong seasonality, Strong cyclic, light downward trend, variance is reducing

# Let's see how seasonality looks over time and is the variance changing
pm2_5_df$Month <- month(pm2_5_df$Date)
pm2_5_df$Year <- year(pm2_5_df$Date)

avg_aqi_by_month_year <- pm2_5_df %>%
  group_by(pm2_5_df$Year, pm2_5_df$Month) %>%
  summarise(
    avg_value = mean(PM2.5.AQI.Value)
  )
## `summarise()` has grouped output by 'pm2_5_df$Year'. You can override using the
## `.groups` argument.
colnames(avg_aqi_by_month_year) <- c("Year", "Month", "avg_value")

reshape_avg_aqi_by_month_year <- 
  sqldf(
    "SELECT 
      Month,
      MAX(CASE WHEN Year = 1999 THEN avg_value END) AS Year_1999,
      MAX(CASE WHEN Year = 2000 THEN avg_value END) AS Year_2000,
      MAX(CASE WHEN Year = 2001 THEN avg_value END) AS Year_2001,
      MAX(CASE WHEN Year = 2002 THEN avg_value END) AS Year_2002,
      MAX(CASE WHEN Year = 2003 THEN avg_value END) AS Year_2003,
      MAX(CASE WHEN Year = 2004 THEN avg_value END) AS Year_2004,
      MAX(CASE WHEN Year = 2005 THEN avg_value END) AS Year_2005,
      MAX(CASE WHEN Year = 2006 THEN avg_value END) AS Year_2006,
      MAX(CASE WHEN Year = 2007 THEN avg_value END) AS Year_2007,
      MAX(CASE WHEN Year = 2008 THEN avg_value END) AS Year_2008,
      MAX(CASE WHEN Year = 2009 THEN avg_value END) AS Year_2009,
      MAX(CASE WHEN Year = 2010 THEN avg_value END) AS Year_2010,
      MAX(CASE WHEN Year = 2011 THEN avg_value END) AS Year_2011,
      MAX(CASE WHEN Year = 2012 THEN avg_value END) AS Year_2012,
      MAX(CASE WHEN Year = 2013 THEN avg_value END) AS Year_2013,
      MAX(CASE WHEN Year = 2014 THEN avg_value END) AS Year_2014,
      MAX(CASE WHEN Year = 2015 THEN avg_value END) AS Year_2015,
      MAX(CASE WHEN Year = 2016 THEN avg_value END) AS Year_2016,
      MAX(CASE WHEN Year = 2017 THEN avg_value END) AS Year_2017,
      MAX(CASE WHEN Year = 2018 THEN avg_value END) AS Year_2018,
      MAX(CASE WHEN Year = 2019 THEN avg_value END) AS Year_2019,
      MAX(CASE WHEN Year = 2020 THEN avg_value END) AS Year_2020,
      MAX(CASE WHEN Year = 2021 THEN avg_value END) AS Year_2021,
      MAX(CASE WHEN Year = 2022 THEN avg_value END) AS Year_2022,
      MAX(CASE WHEN Year = 2023 THEN avg_value END) AS Year_2023
    FROM avg_aqi_by_month_year
    GROUP BY Month
    ORDER BY Month"
  )

colnames(reshape_avg_aqi_by_month_year) <- c(
  "Month", "1999", "2000", "2001", "2002", "2003", "2004", "2005", "2006", 
  "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", 
  "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023")

boxplot(reshape_avg_aqi_by_month_year[2:26])

# We can see that over the years there is downward trend and variance is decreasing
# except 2020 when we see a high peak likely caused by wildfires
# Link: https://en.wikipedia.org/wiki/Lake_Fire_(2020)

Step 6: Split data into train and test - Weekly

# Split the data into a training and test dataset
train <- window(pm2_5_weekly_ts, start = c(1999,1), end=c(2020,52))
test <- window(pm2_5_weekly_ts, start = c(2021,1), end=c(2023,7))

Step 7: Decompose the time series plot - Weekly

# Looking at spectral analysis
periodogram(train, log = "no", plot=TRUE, 
            ylab = "Periodogram",
            xlab = "Frequency",
            lwd=2, xlim = c(0, 0.06))

# There are two trends: 
#  1) A typical yearly (52 weeks) seasonal trend
#  2) A trend that is repeating every 9.6 years (500 weeks)
#  3) A typical half yearly (26 weeks) seasonal trend

# Overall, there is no mixed effect that normal seasonality model cannot capture.

# Decompose the series
plot(decompose(train, type="multiplicative"))

# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
#    However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
#    usually which likely could be due to cold temperatures causing pollutant to
#    not escape the lower atmosphere.
#    Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts 
#    with a increasing trend in the beginning of the year and drops down towards 
#    the end of the year. We can see a seasonal effect with a cycle of 12 months.


# Understand the seasonality and remove it to see the trend
train_diff <- diff(train, lag = 52)
autoplot(train_diff, ylab="Train Seasonal Differencing")

# Let's look if the series is stationary?
kpss.test(train_diff)
## Warning in kpss.test(train_diff): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_diff
## KPSS Level = 0.17702, Truncation lag parameter = 7, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail 
# to reject the null hypothesis of level stationarity (conclusion: stationary)

adf.test(train_diff)
## Warning in adf.test(train_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_diff
## Dickey-Fuller = -8.5534, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that 
# the time series is non-stationary (conclusion: stationary)

acf(train_diff, main = "ACF of Seasonal Differencing Time Series")

pacf(train_diff, main = "PACF of Seasonal Differencing Time Series")

Step 8: Benchmark using snaive model - Weekly

# Forecast the seasonal naïve for (2021-01 to 2023-07)
forecast_snaive <- snaive(train, h=110)
## Warning in lag.default(y, -lag): 'k' is not an integer
# Plot the forecasts for snaive model
plot(forecast_snaive, main = "PM 2.5 AQI Forecast - SNaive",
         xlab = "Week", ylab = "PM 2.5 AQI")
lines(test)

# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive <- mean(abs((test - forecast_snaive$mean)/test))
mape_snaive
## [1] 0.2749545
# Mean Squared Error (MSE)
mse_snaive <- mean((test - forecast_snaive$mean)^2)
mse_snaive
## [1] 599.6278

Step 9: Split data into train and test - Monthly

# Split the data into a training and test dataset
train_monthly <- window(pm2_5_monthly_ts, start = c(1999,1), end=c(2020,12))
test_monthly <- window(pm2_5_monthly_ts, start = c(2021,1), end=c(2023,7))

Step 10: Decompose the time series plot - Monthly

# Looking at spectral analysis
periodogram(train_monthly, log = "no", plot=TRUE, 
            ylab = "Periodogram",
            xlab = "Frequency",
            lwd=2, xlim = c(0, 0.2))

# There are two trends: 
#  1) A typical yearly (12 months) seasonal trend
#  2) A trend that is repeating every 8-9 years (100 months)
#  3) A typical half yearly (6 months) seasonal trend

# Overall, there is no mixed effect that normal seasonality model cannot capture.

# Decompose the series
plot(decompose(train_monthly, type="multiplicative"))

# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall though there are rise every now and then.
#    However the trend is going down with time.
# 2) Winter months have a strong seasonal effect with Nov and Dec being the peak months
#    usually which likely could be due to cold temperatures causing pollutant to
#    not escape the lower atmosphere.
#    Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months where the mean value of each month starts 
#    with a increasing trend in the beginning of the year and drops down towards 
#    the end of the year. We can see a seasonal effect with a cycle of 12 months.


# Understand the seasonality and remove it to see the trend
train_monthly_diff <- diff(train_monthly, lag = 12)
autoplot(train_monthly_diff, ylab="Train Seasonal Differencing (Monthly)")

# Let's look if the series is stationary?
kpss.test(train_monthly_diff)
## Warning in kpss.test(train_monthly_diff): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_monthly_diff
## KPSS Level = 0.14573, Truncation lag parameter = 5, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail 
# to reject the null hypothesis of level stationarity (conclusion: stationary)

adf.test(train_monthly_diff)
## Warning in adf.test(train_monthly_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_monthly_diff
## Dickey-Fuller = -5.1056, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that 
# the time series is non-stationary (conclusion: stationary)

acf(train_monthly_diff, main = "ACF of Seasonal Differencing Time Series")

pacf(train_monthly_diff, main = "PACF of Seasonal Differencing Time Series")

Step 11: Benchmark using snaive model - Monthly

# Forecast the seasonal naïve for (2021-01 to 2023-07)
forecast_snaive_monthly <- snaive(train_monthly, h=31)
# Plot the forecasts for snaive model
plot(forecast_snaive_monthly, main = "PM 2.5 AQI Forecast - SNaive",
         xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_monthly)

# Compare the forecast with the actual test data by calculating MAPE and MSE
# Mean Absolute Percentage Error (MAPE)
mape_snaive <- mean(abs((test_monthly - forecast_snaive_monthly$mean)/test_monthly))
mape_snaive
## [1] 0.20778
# Mean Squared Error (MSE)
mse_snaive <- mean((test_monthly - forecast_snaive_monthly$mean)^2)
mse_snaive
## [1] 282.8901